Load libraries
library(dplyr)
library(readr)
library(ggplot2)
library(maps)
library(reshape2)
library(tidyr)
library(purrr)
library(lubridate)
library(maptools)
library(plotly)
Load data
df <- read_csv('./data/database.csv')
## Parsed with column specification:
## cols(
## .default = col_double(),
## Date = col_character(),
## Time = col_time(format = ""),
## Type = col_character(),
## `Magnitude Type` = col_character(),
## ID = col_character(),
## Source = col_character(),
## `Location Source` = col_character(),
## `Magnitude Source` = col_character(),
## Status = col_character()
## )
## See spec(...) for full column specifications.
## Warning: 3 parsing failures.
## row col expected actual file
## 3379 Time time like 1975-02-23T02:58:41.000Z './data/database.csv'
## 7513 Time time like 1985-04-28T02:53:41.530Z './data/database.csv'
## 20651 Time time like 2011-03-13T02:23:34.520Z './data/database.csv'
df_filtered <- df %>%
filter(!is.na(Time))
df_filtered <- as.data.frame(df_filtered)
Replace column names
colnames(df_filtered) <- c('date', 'time', 'lat', 'lon', 'type', 'depth', 'depth_er', 'depth_ss', 'magnit', 'magnit_type', 'magnit_er', 'magnit_ss', 'azim', 'horiz_dist', 'horiz_er', 'rms', 'id', 'source', 'loc_source', 'magnit_source', 'status')
Convert date to year/month/day format
df_filtered <- df_filtered %>%
mutate(date = as_date(parse_date_time(date, orders=c('mdy'))))
# add columns for year, month and day
df_filtered <- df_filtered %>%
mutate(year = year(date), month = factor(month(date)), day = factor(day(date)))
Percentage of missing values in each column
round(100*colMeans(is.na(df_filtered)),2)
## date time lat lon type
## 0.00 0.00 0.00 0.00 0.00
## depth depth_er depth_ss magnit magnit_type
## 0.00 80.95 69.69 0.00 0.01
## magnit_er magnit_ss azim horiz_dist horiz_er
## 98.60 89.05 68.82 93.15 95.06
## rms id source loc_source magnit_source
## 25.88 0.00 0.00 0.00 0.00
## status year month day
## 0.00 0.00 0.00 0.00
Drop columns
df_filtered <-df_filtered %>%
select(-c(depth_er,magnit_er,azim,horiz_er,
horiz_dist,magnit_type,magnit_ss,depth_ss,id))
Replace RMS missing values with the mean
df_filtered <- df_filtered %>%
mutate(rms=replace(rms, is.na(rms), mean(rms)))
Missing value count
colSums(is.na(df_filtered))
## date time lat lon type
## 0 0 0 0 0
## depth magnit rms source loc_source
## 0 0 6059 0 0
## magnit_source status year month day
## 0 0 0 0 0
Count number of unique values in each column
rapply(df_filtered, function(x) length(unique(x)))
## date time lat lon type
## 12398 20469 20673 21472 4
## depth magnit rms source loc_source
## 3485 64 191 13 48
## magnit_source status year month day
## 24 2 52 12 31
unique(df_filtered$type)
## [1] "Earthquake" "Nuclear Explosion" "Explosion"
## [4] "Rock Burst"
unique(df_filtered$status)
## [1] "Automatic" "Reviewed"
unique(df_filtered$magnit_source)
## [1] "ISCGEM" "OFFICIAL" "CI" "US" "1020" "BRK"
## [7] "NC" "1000" "GCMT" "1009" "UW" "1023"
## [13] "ATLAS" "HRV" "PAR" "NIED" "NN" "SE"
## [19] "PGC" "US_GCMT" "US_PGC" "AK" "PR" "GUC"
unique(df_filtered$loc_source)
## [1] "ISCGEM" "CI" "US" "H" "U" "G" "NC"
## [8] "B" "GCMT" "AG" "UW" "SPE" "HVO" "BRK"
## [15] "ATLAS" "AGS" "PGC" "BOU" "SLC" "OTT" "AEI"
## [22] "AEIC" "CASC" "ISK" "ATH" "THE" "ROM" "MDD"
## [29] "WEL" "GUC" "UNM" "CSEM" "RSPR" "JMA" "NN"
## [36] "CAR" "SJA" "TEH" "BEO" "UCR" "SE" "TUL"
## [43] "TAP" "THR" "LIM" "US_WEL" "AK" "PR"
unique(df_filtered$source)
## [1] "ISCGEM" "ISCGEMSUP" "OFFICIAL" "CI" "US"
## [6] "NC" "GCMT" "UW" "ATLAS" "NN"
## [11] "SE" "AK" "PR"
Store categorical variables as factors
categ_var <- c('type', 'status', 'magnit_source', 'loc_source', 'source')
df_filtered[,categ_var] <- lapply(df_filtered[,categ_var],factor)
Count observations for each type
df_filtered %>%
group_by(type) %>%
summarise(no_rows = length(type))
## # A tibble: 4 x 2
## type no_rows
## <fct> <int>
## 1 Earthquake 23229
## 2 Explosion 4
## 3 Nuclear Explosion 175
## 4 Rock Burst 1
There are very few explosions and rock bursts so we can remove those observations.
df_filtered <- df_filtered %>%
filter(grepl('Earthquake|Nuclear', type))
We want the variables to contribute equally to the analysis, so we need to make sure that they are on the same scale. So we normalize magnitude, depth and rms
normalize <- function(x){
return((x - min(x)) / (max(x) - min(x)))
}
df_norm <- df_filtered
df_norm[c('magnit','depth','rms')] <- lapply(df_filtered[c('magnit','depth','rms')], normalize)
Magnitude classes
df_filtered <- df_filtered %>%
mutate(class = factor(cut(
df_filtered$magnit,
breaks = c(5, 6, 7, 8, Inf),
labels = c("moderate", "strong", "major", "great"),
right = FALSE)
)
)
Histograms for numeric variables
df_filtered %>%
keep(is.numeric) %>%
gather() %>%
ggplot(aes(value))+
facet_wrap(~key, scales='free')+
geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 6057 rows containing non-finite values (stat_bin).
Histogram of earthquakes
df_filtered %>%
ggplot(aes(x=magnit))+
geom_histogram(binwidth=0.5)
Count earthquakes for each bin
df_filtered %>%
count(cut_width(magnit,0.5))
## # A tibble: 8 x 2
## `cut_width(magnit, 0.5)` n
## <fct> <int>
## 1 [5.25,5.75] 11748
## 2 (5.75,6.25] 8009
## 3 (6.25,6.75] 2486
## 4 (6.75,7.25] 816
## 5 (7.25,7.75] 253
## 6 (7.75,8.25] 79
## 7 (8.25,8.75] 10
## 8 (8.75,9.25] 3
Number of earthquakes for each magnitude class
df_filtered %>%
group_by(class) %>%
summarise(events = n()) %>%
ggplot(aes(x = class, y = events))+
geom_bar(stat = 'identity')
Lets zoom in and focus only on moderate and strong earthquakes
df_filtered %>%
filter(magnit < 7) %>%
ggplot(aes(x=magnit))+
geom_histogram(binwidth = 0.1)
df_filtered %>%
ggplot(aes(x=depth,colour=class, y=..density..))+
geom_freqpoly(binwidth=20)
Plot of earthquakes by type
ggplot(data = df_filtered) +
borders('world', colour='gray50',fill='gray50') +
geom_point(aes(x=lon, y=lat, color = type), alpha = 0.3, size = 0.6)
Plot of earthquakes by magnitude
ggplot(data = df_filtered) +
borders('world', colour='gray50',fill='gray50') +
geom_point(aes(x=lon, y=lat, color = class), alpha = 0.3, size = 1)+
scale_colour_manual(values = c("yellow","red", "blue", "black"))
Plot magnitude by erruption type
ggplot(df_filtered, aes(x = magnit)) +
geom_density(aes(fill = type), alpha = 0.4)
{r} # ggplot(df_filtered, aes(x = depth, y = magnit)) + # geom_point(aes(colour = type, shape = type), alpha = 0.4) #Plot number of earthquakes per year
df_filtered %>%
filter(type == 'Earthquake') %>%
group_by(year) %>%
summarise(events = n()) %>%
ggplot(aes(x = year, y = events))+
geom_bar(stat = 'identity') +
theme_minimal()
Plot average depth based on year
df_filtered %>%
filter(type == 'Earthquake') %>%
group_by(year) %>%
summarise(mean_depth = mean(depth)) %>%
ggplot(aes(x = year, y = mean_depth))+
geom_bar(stat = 'identity') +
theme_minimal()
Plot average number of earthquakes for every month
df_filtered %>%
filter(type == 'Earthquake') %>%
group_by(month) %>%
summarise(events = n()) %>%
ggplot(aes(x = month, y = events))+
geom_bar(stat = 'identity') +
theme_minimal()
Plot average depth of earthquake based on month
df_filtered %>%
# filter(type == 'Earthquake') %>%
group_by(month) %>%
summarise(mean_depth = mean(depth)) %>%
ggplot(aes(x = month, y = mean_depth))+
geom_bar(stat = 'identity') +
theme_minimal()
Animation of earthquakes across years
df_filtered %>%
plot_ly(
x = ~lon,
y = ~lat,
frame = ~year,
type = 'scatter',
mode = 'markers',
alpha = 0.3
) %>%
layout(xaxis = list(range = c(-200, 200)),
yaxis = list(range = c(-100, 100)))